In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from prophet import Prophet
import geopandas as gpd
import plotly.express as px

# Read the cleaned posts export into a DataFrame
df = pd.read_json("cleaned_data.json")

### 📊 USER ENGAGEMENT ANALYSIS ###
# Engagement = score plus comment count, with each comment counted twice
df["engagement_score"] = df["score"] + 2 * df["num_comments"]

# Histogram (50 bins) with a KDE overlay, drawn on an explicit Axes
engagement_fig, engagement_ax = plt.subplots(figsize=(10, 5))
sns.histplot(df["engagement_score"], bins=50, kde=True, ax=engagement_ax)
engagement_ax.set_title("User Engagement Score Distribution")
engagement_ax.set_xlabel("Engagement Score")
engagement_ax.set_ylabel("Frequency")
plt.show()

### 🌍 LOCATION ANALYSIS ###
# Posts with no location are bucketed under an explicit "Unknown" label
df["location"] = df["location"].fillna("Unknown")

# Tally posts per location as a two-column (Location, Count) table
location_counts = (
    df["location"]
    .value_counts()
    .rename_axis("Location")
    .reset_index(name="Count")
)

fig = px.bar(
    location_counts,
    x="Location",
    y="Count",
    title="Post Count by Location",
)
fig.show()

# Optional: draw the same counts on a map.
# This needs a dataset that maps each location to a geometry (or lat/lon).
# Example for Indian states (uncomment if applicable):
# india_map = gpd.read_file("india_states.geojson")  # Load India map GeoJSON
# india_map = india_map.merge(location_counts, left_on="state_name", right_on="Location", how="left")
# india_map.plot(column="Count", cmap="Blues", legend=True)
# plt.title("Engagement by State")
# plt.show()

### 📈 TREND ANALYSIS ###
# pd.to_datetime() reads bare numbers as nanoseconds since the epoch, so a
# numeric `created_utc` would collapse every post onto 1970-01-01 and leave
# the forecast with a meaningless series. Numeric values are therefore parsed
# as Unix epoch seconds.
# NOTE(review): seconds is assumed from the Reddit-style column name -- change
# the unit if this export stores milliseconds.
# Strings and already-parsed datetimes take the same path as before, which
# also keeps this cell safe to re-run.
if pd.api.types.is_numeric_dtype(df["created_utc"]):
    df["created_utc"] = pd.to_datetime(df["created_utc"], unit="s")
else:
    df["created_utc"] = pd.to_datetime(df["created_utc"])

# Total score per calendar day, in the two-column (ds, y) layout Prophet expects
time_series = (
    df.groupby(df["created_utc"].dt.date)["score"]
    .sum()
    .rename_axis("ds")
    .reset_index(name="y")
)
# Hand Prophet a real datetime64 column instead of Python date objects
time_series["ds"] = pd.to_datetime(time_series["ds"])

# Fit Prophet on the daily totals and extend the frame past the last observed day
FORECAST_DAYS = 30  # forecast horizon, in days
model = Prophet()
model.fit(time_series)
future = model.make_future_dataframe(periods=FORECAST_DAYS)
forecast = model.predict(future)

# Title the forecast figure's own axes rather than whichever axes pyplot
# currently considers active
fig = model.plot(forecast)
fig.axes[0].set_title("Trend Analysis of Post Scores Over Time")
plt.show()
No description has been provided for this image
02:44:29 - cmdstanpy - INFO - Chain [1] start processing
02:44:30 - cmdstanpy - INFO - Chain [1] done processing
No description has been provided for this image
In [13]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import pipeline
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Read the cleaned posts export into a DataFrame
df = pd.read_json("cleaned_data.json")

# Fail fast: the sentiment steps in this cell all read the 'text' column
has_text_column = "text" in df.columns
if not has_text_column:
    raise ValueError("Dataset must contain a 'text' column.")

### 🤖 SENTIMENT ANALYSIS USING TRANSFORMERS ###
# No model is named, so the library's default sentiment checkpoint is used.
sentiment_model = pipeline("sentiment-analysis")

# Only rows that actually have text are scored; a missing value would make the
# tokenizer raise. Rows without text get NaN in the label column.
has_text = df["text"].notna()
texts_to_score = df.loc[has_text, "text"].astype(str).tolist()

# truncation=True clips posts that exceed the model's maximum sequence length;
# without it a single long post aborts the whole run. Passing the full list
# lets the pipeline iterate internally instead of being re-entered per row.
predictions = sentiment_model(texts_to_score, truncation=True) if texts_to_score else []

df["huggingface_sentiment"] = pd.Series(
    [prediction["label"] for prediction in predictions],
    index=df.index[has_text],
    dtype=object,
).reindex(df.index)

### 🔥 SENTIMENT ANALYSIS USING VADER (Lexicon-based) ###
# One analyzer instance for the whole cell; vader_sentiment() reads it as a
# module-level global.
vader = SentimentIntensityAnalyzer()

def vader_sentiment(text):
    """Label `text` as POSITIVE, NEGATIVE or NEUTRAL from its VADER compound score.

    The compound score comes from the module-level `vader` analyzer. Scores
    of 0.05 and above are POSITIVE, scores of -0.05 and below are NEGATIVE,
    and everything in between is NEUTRAL.
    """
    compound = vader.polarity_scores(text)["compound"]
    if compound >= 0.05:
        return "POSITIVE"
    if compound <= -0.05:
        return "NEGATIVE"
    return "NEUTRAL"

# Label every post with the lexicon-based classifier defined above
df["vader_sentiment"] = df["text"].apply(vader_sentiment)

### 📊 SENTIMENT DISTRIBUTION ###
# One count plot per classifier: (label column, palette, chart title)
for sentiment_column, palette_name, chart_title in (
    ("huggingface_sentiment", "coolwarm", "Sentiment Analysis (Hugging Face)"),
    ("vader_sentiment", "viridis", "Sentiment Analysis (VADER)"),
):
    plt.figure(figsize=(10, 5))
    sns.countplot(x=df[sentiment_column], palette=palette_name)
    plt.title(chart_title)
    plt.show()

### 🔍 PRINT SAMPLE RESULTS ###
# First ten posts with both labels side by side for a quick comparison
sample_columns = ["text", "huggingface_sentiment", "vader_sentiment"]
print(df[sample_columns].head(10))
Positive
In [15]:
### 🌍 LOCATION ANALYSIS ###
# Posts with no location are bucketed under an explicit "Unknown" label
df["location"] = df["location"].fillna("Unknown")

# Split the posts on whether their location is the "Unknown" placeholder
location_is_unknown = df["location"].eq("Unknown")
known_locations = df.loc[~location_is_unknown]
unknown_locations = df.loc[location_is_unknown]

# Per-location tallies for the known locations only
location_counts = (
    known_locations["location"]
    .value_counts()
    .rename_axis("Location")
    .reset_index(name="Count")
)

# "Unknown" goes last as its own row, whatever its count, and only if present
unknown_count = len(unknown_locations)
if unknown_count > 0:
    unknown_row = pd.DataFrame({"Location": ["Unknown"], "Count": [unknown_count]})
    location_counts = pd.concat([location_counts, unknown_row], ignore_index=True)

# Bar chart with each bar's count printed above it
fig = px.bar(
    location_counts,
    x="Location",
    y="Count",
    title="Post Count by Location",
    text="Count",
)
fig.update_traces(textposition="outside")
fig.show()
In [16]:
### 🌍 CATEGORY VS LOCATION ANALYSIS ###

# Replace missing values with an explicit placeholder label in each grouping column
for column_name, placeholder in (("location", "Unknown"), ("category", "Uncategorized")):
    df[column_name] = df[column_name].fillna(placeholder)

# Number of posts for every (location, category) pair that occurs
pair_sizes = df.groupby(["location", "category"]).size()
category_location_counts = pair_sizes.reset_index(name="Count")

# Stacked bars: one bar per location, one coloured segment per category
stacked_bar_options = dict(
    x="location",
    y="Count",
    color="category",
    title="Category Distribution by Location",
    text="Count",
    barmode="stack",
)
fig = px.bar(category_location_counts, **stacked_bar_options)

fig.update_traces(textposition="outside")
fig.show()
In [ ]: